This is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retailer. The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers.
http://archive.ics.uci.edu/ml/datasets/Online+Retail
In [2]:
#Load the csv into an RDD. To begin with, every element of the RDD is a
#plain string holding one line of the csv.
retailData = sc.textFile("OnlineRetail.csv")
firstTwoLines = retailData.take(2)
print(firstTwoLines)
In [ ]:
import csv
import re

from pyspark.mllib.recommendation import ALS, Rating
#Remove the header from the RDD
header = retailData.first()
retailData = retailData.filter(lambda line: line != header)

def parsePurchase(line):
    """Turn one csv line into a (customer id, item id, quantity) tuple.

    The item id is the stock code (column 1) with every non-digit
    character stripped, because MLlib ratings need integer ids. Note
    that stock codes which differ only by letters therefore share an id.

    Returns None when the row cannot be used as a rating: it is blank,
    has too few columns, has a non-integer quantity or customer id, has
    a quantity that is not positive, has a stock code without digits,
    or has no customer id.
    """
    if not isinstance(line, str):
        #Python 2 only: the csv module cannot read unicode objects, so
        #hand it UTF-8 encoded bytes instead
        line = line.encode("utf-8")
    try:
        #csv.reader keeps a quoted field that contains commas in one
        #piece; a plain split(",") would shift every later column
        fields = next(csv.reader([line]), None)
        if fields is None:
            return None
        quantity = int(fields[3])
        itemDigits = re.sub(r"\D", "", fields[1])
        customer = fields[6]
        if quantity <= 0 or not itemDigits or not customer:
            return None
        return (int(customer), int(itemDigits), quantity)
    except (IndexError, ValueError, csv.Error):
        #A malformed row is skipped instead of failing the whole job
        return None

#To produce the ALS model, we need to train it with each individual
#purchase. Each record in the RDD must be the customer id,
#item id, and the rating. In this case, the rating is the quantity
#ordered. MLlib converts these into a sparse, unfactored matrix.
retailData = retailData.map(parsePurchase).\
    filter(lambda purchase: purchase is not None)
#Randomly split the data into a testing set (20%) and a training set (80%)
testRDD, trainRDD = retailData.randomSplit([.2,.8])
trainData = trainRDD.map(lambda l: Rating(l[0],l[1],l[2]))
print(trainData.take(2))
print("")
print(testRDD.take(2))
In [ ]:
#Fit a model to the training RDD with Alternating Least Squares
#rank=5
#   the user-feature and product-feature matrices get 5 columns each
#iterations=10
#   the factorization is run 10 times
rank, numIterations = 5, 10
model = ALS.train(trainData, rank, iterations=numIterations)
print("The model has been trained")
In [ ]:
#Evaluate the model with the test rdd by using the predictAll function.
#The (customer, item) pairs are de-duplicated first: predictAll returns
#one prediction per pair it is given, so a pair that occurs k times in
#the test set would otherwise come back k times and the join below would
#emit k*k rows for it, over-weighting repeat purchases in the error.
testPairs = testRDD.map(lambda l: (l[0], l[1])).distinct()
predict = model.predictAll(testPairs)
#Key the predictions and the actual quantities by (customer, item) so
#that they can be joined. Test records whose pair received no prediction
#drop out of the inner join and are not part of the error.
predictions = predict.map(lambda l: ((l[0], l[1]), l[2]))
ratingsAndPredictions = testRDD.map(lambda l: ((l[0], l[1]), l[2])).join(predictions)
#Cached because it is evaluated twice below (take and mean)
ratingsAndPredictions.cache()
print(ratingsAndPredictions.take(3))
#Calculate and print the Mean Squared Error: each joined value is
#(actual quantity, predicted quantity)
meanSquaredError = ratingsAndPredictions.map(lambda l: (l[1][0] - l[1][1])**2).mean()
print("")
print('Mean squared error = %.4f' % meanSquaredError)
In [ ]:
#Print the model's top recommendations for a single customer
customerId = 15544
numRecommendations = 5
recs = model.recommendProducts(customerId, numRecommendations)
for rec in recs:
    print(rec)
In [ ]:
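#Example output of the recommendation cell above from one run (results
#vary between runs because the train/test split is random); each Rating
#is followed by the description of the recommended product
#Rating(user=15544, product=84568, rating=193.03195106065823)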
#GIRLS ALPHABET IRON ON PATCHES
#Rating(user=15544, product=16033, rating=179.45915040198466)
#MINI HIGHLIGHTER PENS
#Rating(user=15544, product=22266, rating=161.04293255928698)
#EASTER DECORATION HANGING BUNNY
#Rating(user=15544, product=84598, rating=141.00162368678377)
#BOYS ALPHABET IRON ON PATCHES
#Rating(user=15544, product=72803, rating=129.54033486738518)
#ROSE SCENT CANDLE JEWELLED DRAWER
Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197–208, 2012 (Published online before print: 27 August 2012. doi: 10.1057/dbm.2012.17).